# Paths

# Root folder that holds every CSV input used by the figures below.
# The original machine-specific location remains the default; set the
# FIGURE_DATA_DIR environment variable to point the script at another copy
# of the data without editing this file. An unset or empty variable falls
# back to the default.
data_dir <- Sys.getenv("FIGURE_DATA_DIR", unset = "")
if (!nzchar(data_dir)) {
  data_dir <- "C:/Users/eliven/Dropbox/ELLW_2026/code/datasets/Figure"
}

# Input for Figure 2 (user counts per query count)
fre   <- file.path(data_dir, "user_query_frequency_jan.csv")

# Input for Figure 1 (query counts by half-hour)
hourly_info <- file.path(data_dir, "hourly_info.csv")

# Inputs for Figure 3 (one file per information-processing type)
info_aware_path       <- file.path(data_dir, "info_aware_ratio_by_group.csv")
info_acquisition_path <- file.path(data_dir, "info_acquisition_ratio_by_group.csv")
info_integration_path <- file.path(data_dir, "info_integration_ratio_by_group.csv")

# Figures in the main paper

## Figure 1

# ------------------------------------------------------------------
# Figure 1 input: query counts per half-hour slot
# ------------------------------------------------------------------
data <- read.csv(hourly_info, stringsAsFactors = FALSE)

# ------------------------------------------------------------------
# Share of each query type that falls into every half-hour slot.
# Numerator: counts summed within the slot.
# Denominator: counts summed over all rows of the file.
# Missing values are skipped on both sides.
# ------------------------------------------------------------------
data_density <- data %>%
  group_by(half_hour) %>%
  summarize(
    awareness_density =
      sum(Awareness, na.rm = TRUE) / sum(data$Awareness, na.rm = TRUE),
    acquisition_density =
      sum(Acquisition, na.rm = TRUE) / sum(data$Acquisition, na.rm = TRUE),
    integration_density =
      sum(Integration, na.rm = TRUE) / sum(data$Integration, na.rm = TRUE)
  )

# ------------------------------------------------------------------
# Reshape to one row per (half_hour, Type) for ggplot, express the
# shares as percentages, and fix the factor order so the legend reads
# Awareness, Acquisition, Integration.
# ------------------------------------------------------------------
data_long_density <- data_density %>%
  tidyr::pivot_longer(
    cols = c(awareness_density, acquisition_density, integration_density),
    names_to = "Type",
    values_to = "Density"
  ) %>%
  mutate(
    Density = 100 * Density,
    Type = factor(
      Type,
      levels = c(
        "awareness_density",
        "acquisition_density",
        "integration_density"
      )
    )
  )

# ------------------------------------------------------------------
# Trading-session boundaries on the x axis.
# Assumes half_hour is expressed in clock hours (9.5 = 09:30) -- TODO confirm
# against hourly_info.csv.
# ------------------------------------------------------------------
open_time <- 9.5
break_time <- 11.5
afternoon_open_time <- 13
close_time <- 15

# ------------------------------------------------------------------
# Figure 1: share of queries by time of day, one line per query type.
#
# Line widths are passed as `linewidth` (the argument ggplot2 >= 3.4.0
# expects for lines, and the one already used elsewhere in this file);
# `size` is kept only for the text annotation.
# ------------------------------------------------------------------
density_plot <- ggplot(
  data_long_density,
  aes(x = half_hour, y = Density, color = Type, linetype = Type)
) +
  # Shade the morning and afternoon trading sessions
  annotate(
    "rect",
    xmin = c(open_time, afternoon_open_time),
    xmax = c(break_time, close_time),
    ymin = -Inf, ymax = Inf,
    alpha = 0.2, fill = "lightblue"
  ) +

  # Full-height reference lines at market open and close
  geom_vline(
    xintercept = c(open_time, close_time),
    color = "black", linetype = "longdash", linewidth = 0.8
  ) +

  # Lunch-break markers: each vertical line is drawn in two pieces
  # (below y = 1.2 and above y = 1.8) so that it does not cross the
  # "Trading Hours" label centred at y = 1.5.
  annotate(
    "segment",
    x    = rep(c(break_time, afternoon_open_time), each = 2),
    xend = rep(c(break_time, afternoon_open_time), each = 2),
    y    = rep(c(-Inf, 1.8), times = 2),
    yend = rep(c(1.2, Inf), times = 2),
    color = "black", linetype = "longdash", linewidth = 0.8
  ) +

  # Label centred between market open and close
  annotate(
    "text",
    x = (open_time + close_time) / 2,
    y = 1.5,
    label = "Trading\nHours",
    color = "black",
    family = "Times New Roman",
    size = 9,
    fontface = "bold",
    hjust = 0.5
  ) +

  # The three density curves
  geom_line(linewidth = 1.2) +

  labs(
    x = "Hour of the Day",
    y = "Density of Queries"
  ) +

  # Colour and linetype scales carry identical labels so that ggplot
  # merges them into a single legend.
  scale_color_manual(
    values = c(
      "awareness_density" = "#0053c8",
      "acquisition_density" = "#c11023",
      "integration_density" = "#008000"
    ),
    labels = c(
      "awareness_density" = "Awareness",
      "acquisition_density" = "Acquisition",
      "integration_density" = "Integration"
    )
  ) +
  scale_linetype_manual(
    values = c(
      "awareness_density" = "solid",
      "acquisition_density" = "longdash",
      "integration_density" = "dotdash"
    ),
    labels = c(
      "awareness_density" = "Awareness",
      "acquisition_density" = "Acquisition",
      "integration_density" = "Integration"
    )
  ) +

  # Axis ranges: whole day on x, 0-4 percent on y
  scale_x_continuous(breaks = seq(0, 24, 2), limits = c(0, 24)) +
  scale_y_continuous(breaks = seq(0, 4, 0.5), limits = c(0, 4)) +

  theme_classic(base_size = 16) +
  theme(
    text = element_text(family = "Times New Roman"),
    axis.title = element_text(face = "plain", size = 22),
    axis.text.x = element_text(size = 20, color = "black"),
    axis.text.y = element_text(size = 20, color = "black"),
    plot.title = element_blank(),
    plot.subtitle = element_blank(),
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box.background = element_rect(
      color = "black", linewidth = 0.5, fill = "white"
    ),
    legend.text = element_text(size = 22),
    legend.key.width = unit(1.5, "cm"),
    legend.key.height = unit(1, "cm"),
    legend.spacing.x = unit(0.2, "cm"),
    legend.box.margin = margin(t = 5, b = 5, l = 10, r = 10)
  ) +
  # Untitled legend. The key lines are drawn slightly thicker than the
  # plotted lines; this must be `linewidth` because `size` does not
  # control the width of line keys. No shape guide is declared since the
  # plot maps no shape aesthetic.
  guides(
    color = guide_legend(title = NULL),
    linetype = guide_legend(
      title = NULL,
      override.aes = list(linewidth = 1.5)
    )
  )

# ------------------------------------------------------------------
# Render the figure
# ------------------------------------------------------------------
print(density_plot)

## Figure 2

# ------------------------------------------------------------------
# Figure 2 data: percentage of users by total number of queries
#   1. Load the user tallies per query_count.
#   2. Keep counts below 20 as their own label; pool everything from 20
#      upwards under the single label ">=20".
#   3. Add up user_count within each label.
#   4. Turn the label into a factor ordered "1", ..., "19", ">=20" so the
#      x axis is in numeric rather than alphabetical order.
#   5. Express each label's user_count as a percentage of all users.
# ------------------------------------------------------------------
data <- read.csv(fre, stringsAsFactors = FALSE) %>%
  mutate(
    query_count = ifelse(
      query_count < 20,
      as.character(query_count),
      ">=20"
    )
  ) %>%
  group_by(query_count) %>%
  summarize(user_count = sum(user_count)) %>%
  mutate(
    query_count = factor(
      query_count,
      levels = c(as.character(1:19), ">=20")
    ),
    ratio = user_count / sum(user_count) * 100
  )

# ------------------------------------------------------------------
# Figure 2: bar chart of user percentage per query-count category.
# Bar heights come straight from the precomputed `ratio` column.
# The axis elements inherit the Times New Roman family from `text`,
# so only their size, face and colour are set individually.
# ------------------------------------------------------------------
plot <- ggplot(data, aes(x = query_count, y = ratio)) +
  geom_col(fill = "#0053c8", color = "black", width = 0.7) +
  labs(
    x = "User's Total Query Counts",
    y = "User Percentage (%)"
  ) +
  # Ticks every 5 percent; the upper limit of 41 leaves a little headroom
  # above the 40 percent tick.
  scale_y_continuous(limits = c(0, 41), breaks = seq(0, 40, 5)) +
  theme_classic(base_size = 16) +
  theme(
    text = element_text(family = "Times New Roman"),
    axis.title = element_text(face = "plain", size = 24),
    axis.text.x = element_text(size = 18, color = "black"),
    axis.text.y = element_text(size = 20, color = "black")
  )

# ------------------------------------------------------------------
# Render the figure
# ------------------------------------------------------------------
print(plot)

## Figure 3

# Figure 3 data: one CSV per information-processing type.
# Each table is loaded, tagged with its type, and given the same four
# column names so the tables can be stacked. The renaming is positional:
# it relies on every CSV having exactly three columns, in the order
# (x, question_order_group, value). The first column is presumably a row
# index written by the export -- TODO confirm.

# Awareness
info_aware_data <- read.csv(info_aware_path, stringsAsFactors = FALSE)
info_aware_data$Info_Type <- "info_aware"
names(info_aware_data) <- c("x", "question_order_group", "value", "Info_Type")

# Acquisition
info_acquisition_data <- read.csv(
  info_acquisition_path,
  stringsAsFactors = FALSE
)
info_acquisition_data$Info_Type <- "info_acquisition"
names(info_acquisition_data) <- c(
  "x", "question_order_group", "value", "Info_Type"
)

# Integration
info_integration_data <- read.csv(
  info_integration_path,
  stringsAsFactors = FALSE
)
info_integration_data$Info_Type <- "info_integration"
names(info_integration_data) <- c(
  "x", "question_order_group", "value", "Info_Type"
)

# Stack the three tables
data_combined <- rbind(
  info_aware_data,
  info_acquisition_data,
  info_integration_data
)

# Convert the tag to a factor whose level order (Awareness, Acquisition,
# Integration) drives the legend order, relabelling the raw tags with
# their display names in the same step.
data_combined$Info_Type <- factor(
  data_combined$Info_Type,
  levels = c("info_aware", "info_acquisition", "info_integration"),
  labels = c("Awareness", "Acquisition", "Integration")
)

# --- Upper panel (p_high): Integration series only ---
# Shows the 0.70-0.81 band of the y axis. The x axis, its labels and the
# y-axis title are all hidden, so the panel can sit directly above p_low
# and share that panel's x axis.
p_high <- data_combined %>%
  subset(Info_Type == "Integration") %>%
  ggplot(aes(x = question_order_group, y = value)) +
  # Same colour, linetype and marker as the Integration entry in p_low
  geom_line(linetype = "dotdash", linewidth = 1, color = "#008000") +
  geom_point(shape = 15, size = 3.5, color = "#008000") +
  scale_x_continuous(
    breaks = c(1, 5, 10, 15, 20),
    labels = c("1", "5", "10", "15", "20")
  ) +
  scale_y_continuous(
    breaks = seq(0.70, 0.80, 0.05),
    limits = c(0.7, 0.81)
  ) +
  theme_classic(base_size = 16) +
  theme(
    text = element_text(family = "Times New Roman"),
    # y tick labels match the size used in p_low
    axis.text.y = element_text(size = 20, color = "black"),
    axis.title.y = element_blank(),
    # Everything belonging to the x axis is suppressed
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.line.x = element_blank(),
    legend.position = "none"
  )

# --- Lower panel (p_low): all three series, y axis limited to 0.30-0.55 ---
# Values outside the y limits are dropped by the scale, so ggplot2 emits
# "removed rows" warnings for them. All three types still appear in the
# legend because Info_Type is a factor carrying all three levels.
# NOTE(review): this looks like the intended broken-axis design, with the
# Integration series drawn in p_high instead -- confirm.
p_low <- ggplot(data_combined, aes(
  x = question_order_group,
  y = value,
  color = Info_Type,
  linetype = Info_Type,
  shape = Info_Type
)) +
  geom_line(linewidth = 1) +
  geom_point(size = 3.5) +
  # Longer alternative y label kept for reference:
  #   "Likelihood of Each Information Processing Type in User Queries"
  labs(
    x = "User's Nth Query",
    y = "Likelihood of Information Processing Tasks"
  ) +
  scale_y_continuous(
    breaks = seq(0.3, 0.55, 0.05),
    limits = c(0.3, 0.55)
  ) +
  scale_x_continuous(
    breaks = c(1, 5, 10, 15, 20),
    labels = c("1", "5", "10", "15", "20")
  ) +
  # The legend for each scale is configured once, in guides() below.
  scale_color_manual(
    values = c(
      "Awareness" = "#0053c8",
      "Acquisition" = "#c11023",
      "Integration" = "#008000"
    )
  ) +
  scale_linetype_manual(
    values = c(
      "Awareness" = "solid",
      "Acquisition" = "longdash",
      "Integration" = "dotdash"
    )
  ) +
  scale_shape_manual(
    values = c(
      "Awareness" = 16,    # Filled circle
      "Acquisition" = 17,  # Triangle
      "Integration" = 15   # Filled square
    )
  ) +
  theme_classic(base_size = 16) +
  theme(
    text = element_text(family = "Times New Roman"),
    axis.text = element_text(size = 20, color = "black"),
    axis.title.x = element_text(size = 22),
    axis.title.y = element_text(size = 22),
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box.background = element_rect(
      color = "black", linewidth = 0.5, fill = "white"
    ),
    legend.text = element_text(size = 22),
    legend.key.width = unit(1.5, "cm"),
    legend.key.height = unit(1, "cm"),
    legend.spacing.x = unit(0.2, "cm"),
    legend.box.margin = margin(t = 5, b = 5, l = 10, r = 10)
  ) +
  # Colour, linetype and shape share one merged, untitled legend, so their
  # override.aes lists are combined. Each override must therefore use a
  # distinct aesthetic: `size` enlarges the point markers and `linewidth`
  # thickens the key lines. Using `size` for both makes ggplot2 discard the
  # second one with a "Duplicated override.aes is ignored" warning.
  guides(
    shape = guide_legend(title = NULL, override.aes = list(size = 5)),
    color = guide_legend(title = NULL),
    linetype = guide_legend(
      title = NULL,
      override.aes = list(linewidth = 1.5)
    )
  )

# Stack the two panels in a single column with patchwork: p_high takes 30%
# of the height and p_low 70%. Legends are deliberately not collected
# (no guides = "collect"), so p_low keeps its own legend untouched.
plot_stacked <- (p_high / p_low) +
  plot_layout(heights = c(0.30, 0.70), ncol = 1)

print(plot_stacked)

# --- Optional PDF export with an axis-break symbol (~) on the y axis ---
# Disabled by default; uncomment to write the file. The "~" is placed by
# hand in page coordinates (npc) at the junction of the two panels.
#cairo_pdf("info_processing_ar.pdf", width = 12, height = 9)
#grid.draw(plot_stacked)
#grid.text("~", x = unit(0.099, "npc"), y = unit(0.742, "npc"),
#          gp = gpar(fontsize = 20, fontface = "bold"))
#dev.off()